In [1]:
import pandas as pd
import numpy as np

#to plot the data
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
%matplotlib inline
In [2]:
# Input files don't have column names
dependent_var = ['RUL']
index_columns_names = ["UnitNumber","Cycle"]
operational_settings_columns_names = ['mode1', 'mode2', 'mode3']
sensor_measure_columns_names =['sensor{}'.format(i) for i in range(1, 22)] 
input_file_column_names = index_columns_names + operational_settings_columns_names + sensor_measure_columns_names
In [3]:
#Importing train dataset and calculating remaining useful life



df_train = pd.read_csv('C:\\Users\\Madhu Shree\\OneDrive\\Desktop\\Generative Model\\data\\CMAPSSData\\train_FD001.txt',delim_whitespace=True,names=input_file_column_names)

rul = pd.DataFrame(df_train.groupby('UnitNumber')['Cycle'].max()).reset_index()
rul.columns = ['UnitNumber', 'max']
df_train = df_train.merge(rul, on=['UnitNumber'], how='left')
df_train['RUL'] = df_train['max'] - df_train['Cycle']
df_train.drop('max', axis=1, inplace=True)
C:\Users\Madhu Shree\AppData\Local\Temp\ipykernel_9432\2559281974.py:5: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\s+'`` instead
  df_train = pd.read_csv('C:\\Users\\Madhu Shree\\OneDrive\\Desktop\\Generative Model\\data\\CMAPSSData\\train_FD001.txt',delim_whitespace=True,names=input_file_column_names)
In [4]:
df_train.head()
Out[4]:
UnitNumber Cycle mode1 mode2 mode3 sensor1 sensor2 sensor3 sensor4 sensor5 ... sensor13 sensor14 sensor15 sensor16 sensor17 sensor18 sensor19 sensor20 sensor21 RUL
0 1 1 -0.0007 -0.0004 100.0 518.67 641.82 1589.70 1400.60 14.62 ... 2388.02 8138.62 8.4195 0.03 392 2388 100.0 39.06 23.4190 191
1 1 2 0.0019 -0.0003 100.0 518.67 642.15 1591.82 1403.14 14.62 ... 2388.07 8131.49 8.4318 0.03 392 2388 100.0 39.00 23.4236 190
2 1 3 -0.0043 0.0003 100.0 518.67 642.35 1587.99 1404.20 14.62 ... 2388.03 8133.23 8.4178 0.03 390 2388 100.0 38.95 23.3442 189
3 1 4 0.0007 0.0000 100.0 518.67 642.35 1582.79 1401.87 14.62 ... 2388.08 8133.83 8.3682 0.03 392 2388 100.0 38.88 23.3739 188
4 1 5 -0.0019 -0.0002 100.0 518.67 642.37 1582.85 1406.22 14.62 ... 2388.04 8133.80 8.4294 0.03 393 2388 100.0 38.90 23.4044 187

5 rows × 27 columns

In [5]:
#Importing Test dataset

df_test = pd.read_csv('C:\\Users\\Madhu Shree\\OneDrive\\Desktop\\Generative Model\\data\\CMAPSSData\\test_FD001.txt', delim_whitespace=True, names=input_file_column_names)
df_test.head()
C:\Users\Madhu Shree\AppData\Local\Temp\ipykernel_9432\1464558967.py:3: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\s+'`` instead
  df_test = pd.read_csv('C:\\Users\\Madhu Shree\\OneDrive\\Desktop\\Generative Model\\data\\CMAPSSData\\test_FD001.txt', delim_whitespace=True, names=input_file_column_names)
Out[5]:
UnitNumber Cycle mode1 mode2 mode3 sensor1 sensor2 sensor3 sensor4 sensor5 ... sensor12 sensor13 sensor14 sensor15 sensor16 sensor17 sensor18 sensor19 sensor20 sensor21
0 1 1 0.0023 0.0003 100.0 518.67 643.02 1585.29 1398.21 14.62 ... 521.72 2388.03 8125.55 8.4052 0.03 392 2388 100.0 38.86 23.3735
1 1 2 -0.0027 -0.0003 100.0 518.67 641.71 1588.45 1395.42 14.62 ... 522.16 2388.06 8139.62 8.3803 0.03 393 2388 100.0 39.02 23.3916
2 1 3 0.0003 0.0001 100.0 518.67 642.46 1586.94 1401.34 14.62 ... 521.97 2388.03 8130.10 8.4441 0.03 393 2388 100.0 39.08 23.4166
3 1 4 0.0042 0.0000 100.0 518.67 642.44 1584.12 1406.42 14.62 ... 521.38 2388.05 8132.90 8.3917 0.03 391 2388 100.0 39.00 23.3737
4 1 5 0.0014 0.0000 100.0 518.67 642.51 1587.19 1401.92 14.62 ... 522.15 2388.03 8129.54 8.4031 0.03 390 2388 100.0 38.99 23.4130

5 rows × 26 columns

In [6]:
#Importing True RUL of engines of Test data



y_true = pd.read_csv('C:\\Users\\Madhu Shree\\OneDrive\\Desktop\\Generative Model\\data\\CMAPSSData\\RUL_FD001.txt',delim_whitespace=True,names=["RUL"])
y_true["UnitNumber"] = y_true.index
y_true.head()
C:\Users\Madhu Shree\AppData\Local\Temp\ipykernel_9432\567814418.py:5: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\s+'`` instead
  y_true = pd.read_csv('C:\\Users\\Madhu Shree\\OneDrive\\Desktop\\Generative Model\\data\\CMAPSSData\\RUL_FD001.txt',delim_whitespace=True,names=["RUL"])
Out[6]:
RUL UnitNumber
0 112 0
1 98 1
2 69 2
3 82 3
4 91 4

Data Analysis¶

In [7]:
#shape of the train data
df_train.shape
Out[7]:
(20631, 27)
In [8]:
#Finding the missing values
df_train.isnull().sum()
Out[8]:
UnitNumber    0
Cycle         0
mode1         0
mode2         0
mode3         0
sensor1       0
sensor2       0
sensor3       0
sensor4       0
sensor5       0
sensor6       0
sensor7       0
sensor8       0
sensor9       0
sensor10      0
sensor11      0
sensor12      0
sensor13      0
sensor14      0
sensor15      0
sensor16      0
sensor17      0
sensor18      0
sensor19      0
sensor20      0
sensor21      0
RUL           0
dtype: int64
In [9]:
#Unit
df_train.UnitNumber.unique()
Out[9]:
array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100], dtype=int64)
In [10]:
#Engines having maximum life

df_train[["UnitNumber","Cycle"]].groupby("UnitNumber").max().sort_values(by = ["Cycle"], ascending= False).head(5)
Out[10]:
Cycle
UnitNumber
69 362
92 341
96 336
67 313
83 293

Mode Setting¶

In [11]:
#Visualizing Mode Setting of units 1 to 15.
In [12]:
fig,ax=plt.subplots(1,3,figsize=(30,8),sharex='all')
for i in range(0,3):
    df_u1=df_train.query('UnitNumber==2').reset_index(drop = True)
    df_u1['rolling_avg'] = df_u1['mode'+str(i+1)].rolling(window = 5).mean()
    df_u1['rolling_avg'].plot(kind = "line", ax = ax[i])
    ax[i].set_title('mode'+str(i+1))
    ax[i].set_xlabel("Cycle")
In [13]:
#Mode1

fig, axes = plt.subplots(nrows = 5, ncols = 3, figsize = (25,25))

ax = axes.ravel()

for i in range(0,15):
    temp = df_train.mode1[df_train.UnitNumber == i+1].reset_index(drop = True).rolling(window = 5).mean()
    temp.plot(kind = "line", ax = ax[i])
    ax[i].set_title('Unit'+str(i+1))
    ax[i].set_xlabel("Cycle")
In [14]:
#Mode2

fig, axes = plt.subplots(nrows = 5, ncols = 3, figsize = (25,25))

ax = axes.ravel()

for i in range(0,15):
    temp = df_train.mode2[df_train.UnitNumber == i+1].reset_index(drop = True).rolling(window = 5).mean()
    temp.plot(kind = "line", ax = ax[i])
    ax[i].set_title('Unit'+str(i+1))
    ax[i].set_xlabel("Cycle")
In [15]:
#Mode3

fig, axes = plt.subplots(nrows = 5, ncols = 3, figsize = (25,25))

ax = axes.ravel()

for i in range(0,15):
    temp = df_train.mode3[df_train.UnitNumber == i+1].reset_index(drop = True).rolling(window = 5).mean()
    temp.plot(kind = "line", ax = ax[i])
    ax[i].set_title('Unit'+str(i+1))
    ax[i].set_xlabel("Cycle")
In [16]:
fig,ax=plt.subplots(7,3,figsize=(30,20),sharex=True)
df_u1=df_train.query('UnitNumber==5')
c=0
for i in range(0,7):
    for j in range(0,3):
        ax[i,j].plot(df_u1.Cycle.values, df_u1['sensor'+str(c+1)])
        ax[i,j].set_title('sensor'+str(c+1))
        ax[i,j].axvline(0,c='r')
        c+=1
plt.suptitle('Sensor Traces: Unit 5',fontsize=50)
plt.show()
In [17]:
#Sensor Measure 6

fig, axes = plt.subplots(nrows = 5, ncols = 3, figsize = (25,25))

ax = axes.ravel()

for i in range(0,15):
    temp = df_train.sensor6[df_train.UnitNumber == i+1].reset_index(drop = True)
    temp.plot(kind = "line", ax = ax[i])
    ax[i].set_title('Unit'+str(i+1))
    ax[i].set_xlabel("Cycle")
In [18]:
# necessary features for analysis
not_required_feats = ["sensors1", "sensors5", "sensors6", "sensors10", 
                      "sensors16", "sensors18", "sensors19"]
feats = [feat for feat in sensor_measure_columns_names if feat not in not_required_feats]
feats
Out[18]:
['sensor1',
 'sensor2',
 'sensor3',
 'sensor4',
 'sensor5',
 'sensor6',
 'sensor7',
 'sensor8',
 'sensor9',
 'sensor10',
 'sensor11',
 'sensor12',
 'sensor13',
 'sensor14',
 'sensor15',
 'sensor16',
 'sensor17',
 'sensor18',
 'sensor19',
 'sensor20',
 'sensor21']
In [19]:
#Correlation Analysis

corr = df_train[feats + ["RUL"]].corr()

fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111)
ax = sns.heatmap(corr, annot=True, cmap = "coolwarm", fmt=".2f")
In [ ]: